Introduction

# Introduction: The primary goal of this assignment is to analyse a 'Certified Term Deposit' campaign conducted by a bank for its clients. We explore the success of this campaign using various data exploration techniques such as visualization, summary statistics and comparison.
# We also use decision tree modelling to derive a strategy to reach a particular goal, i.e. to predict and interpret whether a client will subscribe to a 'Certified Term Deposit' (y).

1.Set up, import and inspect the data:

A.

# All the libraries needed to achieve our goal:
library(rmarkdown)
library(psych)
library(scatterplot3d)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(C50)
library(rminer)
## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''

## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''

## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''
# Load the balanced campaign data set. Text columns are kept as character
# (stringsAsFactors = FALSE) and converted to factors explicitly further down.
# NOTE(review): the path points into one user's home directory; adjust it
# before running this elsewhere.
bank_client <- read.csv("~/Downloads/Data mining/CD_additional_balanced.csv", stringsAsFactors = FALSE)
# Optional: the accompanying metadata workbook (left disabled).
#read_excel("~/Downloads/Data mining/CD_metadata.xlsx")

# Inspect the structure of the data set. Several variables with a fixed set of
# levels are read in as 'chr' and still need converting to factors.
str(bank_client)  
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : chr  "blue-collar" "entrepreneur" "technician" "technician" ...
##  $ marital       : chr  "divorced" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "university.degree" "basic.9y" "professional.course" ...
##  $ default       : chr  "unknown" "unknown" "no" "unknown" ...
##  $ housing       : chr  "yes" "yes" "no" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "yes" "yes" "yes" "yes" ...

Transforming categorical variables with definite values to correct data type:

# Convert every categorical column (fixed set of levels) from character to
# factor in a single vectorised step instead of one assignment per column.
# Column order and all other columns are left untouched.
factor_cols <- c(
  "job", "marital", "education", "default", "housing", "loan",
  "contact", "month", "day_of_week", "poutcome", "y"
)
# Fail early with a clear message if the CSV lacks an expected column.
stopifnot(all(factor_cols %in% names(bank_client)))
bank_client[factor_cols] <- lapply(bank_client[factor_cols], as.factor)

# Analyzing the data again:
str(bank_client)
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...

To check the data distribution across different variables:

summary(bank_client)
##       age                job           marital    
##  Min.   :17.0   admin.     :2517   divorced:1021  
##  1st Qu.:31.0   blue-collar:1769   married :5338  
##  Median :38.0   technician :1459   single  :2900  
##  Mean   :40.4   services   : 773   unknown :  21  
##  3rd Qu.:48.0   management : 651                  
##  Max.   :98.0   retired    : 595                  
##                 (Other)    :1516                  
##                education       default        housing          loan     
##  university.degree  :3007   no     :7824   no     :4104   no     :7688  
##  high.school        :2102   unknown:1456   unknown: 225   unknown: 225  
##  professional.course:1190                  yes    :4951   yes    :1367  
##  basic.9y           :1177                                               
##  basic.4y           : 895                                               
##  basic.6y           : 458                                               
##  (Other)            : 451                                               
##       contact         month      day_of_week    duration     
##  cellular :6672   may    :2533   fri:1763    Min.   :   1.0  
##  telephone:2608   jul    :1477   mon:1846    1st Qu.: 145.0  
##                   aug    :1353   thu:2000    Median : 265.0  
##                   jun    :1169   tue:1810    Mean   : 387.4  
##                   nov    : 886   wed:1861    3rd Qu.: 528.0  
##                   apr    : 785               Max.   :4199.0  
##                   (Other):1077                               
##     campaign          pdays          previous             poutcome   
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.0000   failure    :1074  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000   nonexistent:7244  
##  Median : 2.000   Median :999.0   Median :0.0000   success    : 962  
##  Mean   : 2.333   Mean   :887.3   Mean   :0.3153                     
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000                     
##  Max.   :39.000   Max.   :999.0   Max.   :6.0000                     
##                                                                      
##   emp.var.rate     cons.price.idx  cons.conf.idx      euribor3m    
##  Min.   :-3.4000   Min.   :92.20   Min.   :-50.80   Min.   :0.634  
##  1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70   1st Qu.:1.244  
##  Median :-0.1000   Median :93.44   Median :-41.80   Median :4.021  
##  Mean   :-0.4963   Mean   :93.48   Mean   :-40.22   Mean   :2.960  
##  3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40   3rd Qu.:4.959  
##  Max.   : 1.4000   Max.   :94.77   Max.   :-26.90   Max.   :5.045  
##                                                                    
##   nr.employed     y       
##  Min.   :4964   no :4640  
##  1st Qu.:5076   yes:4640  
##  Median :5191             
##  Mean   :5135             
##  3rd Qu.:5228             
##  Max.   :5228             
## 

2. Exploring the numeric variables (age, duration, campaign, and pdays) via different visualizations:

A. Histograms:

# Histograms of the four numeric variables. Each hist() result is stored so
# its breaks and counts can be inspected afterwards.
age_hist <- hist(
  bank_client$age,
  breaks = 12, col = "lightblue", border = "red",
  main = "Histogram of 'age' of Clients",
  xlab = "Age", ylab = "No. of clients"
)

duration_hist <- hist(
  bank_client$duration,
  breaks = 15, col = "green", border = "black",
  main = "Histogram of last contact 'duration'",
  xlab = "duration in secs", ylab = "No. of clients"
)

campaign_hist <- hist(
  bank_client$campaign,
  breaks = 25, col = "lightpink", border = "blue",
  main = "Histogram of no. of contacts during campaign",
  xlab = "no of contacts", ylab = "clients"
)

# NOTE(review): pdays is 999 for most rows (median and max in the summary are
# both 999), which looks like a "not previously contacted" placeholder and
# dominates this plot - confirm against the metadata.
# Title typo fixed ("ofter" -> "after") so it matches the pdays boxplot title.
pdays_hist <- hist(
  bank_client$pdays,
  breaks = 25, col = "yellow", border = "green",
  main = "No. of days after a client was last contacted",
  xlab = "no of days", ylab = "clients"
)

B. Boxplots:

# Box plots of the same four numeric variables. The value returned by
# boxplot() (the box statistics) is stored under a per-variable name.
age_box <- boxplot(
  bank_client$age,
  boxwex = 1.0,
  col = c("orange", "yellow"),
  main = "Box plot of client 'age'",
  ylab = "Age"
)

duration_box <- boxplot(
  bank_client$duration,
  boxwex = 1.0,
  col = c("green", "yellow"),
  main = "Box plot of last contact 'duration'",
  ylab = "duration in secs"
)

campaign_box <- boxplot(
  bank_client$campaign,
  boxwex = 1.0,
  col = c("lightblue", "yellow"),
  main = "Box plot of contacts during campaign",
  ylab = "no of contacts"
)

pdays_box <- boxplot(
  bank_client$pdays,
  boxwex = 1.0,
  col = c("lightblue", "green"),
  main = "No. of days after a client was last contacted",
  ylab = "no of days"
)

C. Deciles of the variable:

# Deciles (0%, 10%, ..., 100%) of each numeric variable, all computed on one
# shared probability grid.
decile_probs <- seq(0, 1, by = 0.1)

# Deciles of age:
quantile(bank_client$age, probs = decile_probs)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##   17   27   30   33   35   38   41   46   51   57   98
# Deciles of duration:
quantile(bank_client$duration, probs = decile_probs)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1   80  124  167  211  265  340  452  615  860 4199
# Deciles of campaign:
quantile(bank_client$campaign, probs = decile_probs)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1    1    1    1    1    2    2    2    3    4   39
# Deciles of pdays:
quantile(bank_client$pdays, probs = decile_probs)
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0   11  999  999  999  999  999  999  999  999  999

3. Exploring factor variables:

A.1 Count value of instances per level for each categorical variable:

#Job
table(bank_client$job)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          2517          1769           308           216           651 
##       retired self-employed      services       student    technician 
##           595           306           773           358          1459 
##    unemployed       unknown 
##           248            80
#marital
table(bank_client$marital)
## 
## divorced  married   single  unknown 
##     1021     5338     2900       21
#education
table(bank_client$education)
## 
##            basic.4y            basic.6y            basic.9y 
##                 895                 458                1177 
##         high.school          illiterate professional.course 
##                2102                   6                1190 
##   university.degree             unknown 
##                3007                 445
#default 
table(bank_client$default)
## 
##      no unknown 
##    7824    1456
#housing
table(bank_client$housing)
## 
##      no unknown     yes 
##    4104     225    4951
#loan
table(bank_client$loan)
## 
##      no unknown     yes 
##    7688     225    1367
#contact
table(bank_client$contact)
## 
##  cellular telephone 
##      6672      2608
#month
table(bank_client$month)
## 
##  apr  aug  dec  jul  jun  mar  may  nov  oct  sep 
##  785 1353  100 1477 1169  313 2533  886  369  295
#day_of_week
table(bank_client$day_of_week)
## 
##  fri  mon  thu  tue  wed 
## 1763 1846 2000 1810 1861
#poutcome
table(bank_client$poutcome)
## 
##     failure nonexistent     success 
##        1074        7244         962
#y
table(bank_client$y)
## 
##   no  yes 
## 4640 4640

A.2 Percentage value of instances belonging to that level.

#Job
prop.table(table(bank_client$job))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##    0.27122845    0.19062500    0.03318966    0.02327586    0.07015086 
##       retired self-employed      services       student    technician 
##    0.06411638    0.03297414    0.08329741    0.03857759    0.15721983 
##    unemployed       unknown 
##    0.02672414    0.00862069
#marital
prop.table(table(bank_client$marital))
## 
##    divorced     married      single     unknown 
## 0.110021552 0.575215517 0.312500000 0.002262931
#education
prop.table(table(bank_client$education))
## 
##            basic.4y            basic.6y            basic.9y 
##        0.0964439655        0.0493534483        0.1268318966 
##         high.school          illiterate professional.course 
##        0.2265086207        0.0006465517        0.1282327586 
##   university.degree             unknown 
##        0.3240301724        0.0479525862
#default 
prop.table(table(bank_client$default))
## 
##        no   unknown 
## 0.8431034 0.1568966
#housing
prop.table(table(bank_client$housing))
## 
##         no    unknown        yes 
## 0.44224138 0.02424569 0.53351293
#loan
prop.table(table(bank_client$loan))
## 
##         no    unknown        yes 
## 0.82844828 0.02424569 0.14730603
#contact
prop.table(table(bank_client$contact))
## 
##  cellular telephone 
## 0.7189655 0.2810345
#month
prop.table(table(bank_client$month))
## 
##        apr        aug        dec        jul        jun        mar 
## 0.08459052 0.14579741 0.01077586 0.15915948 0.12596983 0.03372845 
##        may        nov        oct        sep 
## 0.27295259 0.09547414 0.03976293 0.03178879
#day_of_week
prop.table(table(bank_client$day_of_week))
## 
##       fri       mon       thu       tue       wed 
## 0.1899784 0.1989224 0.2155172 0.1950431 0.2005388
#poutcome
prop.table(table(bank_client$poutcome))
## 
##     failure nonexistent     success 
##   0.1157328   0.7806034   0.1036638
#y
prop.table(table(bank_client$y))
## 
##  no yes 
## 0.5 0.5

B. Bar plotting 2 of the above factor variables. The number of instances within a level name for each possible value of these variables is depicted.

# Number of last contacts per month, most frequent month first.
month_counts <- sort(table(bank_client$month), decreasing = TRUE)
barplot(
  month_counts,
  main = "Barplot of last contact month ",
  border = "dark blue",
  density = 20,
  col = "yellow"
)

# Number of clients per marital status, most frequent status first.
marital_counts <- sort(table(bank_client$marital), decreasing = TRUE)
barplot(
  marital_counts,
  main = "Barplot of marital status of Clients",
  border = "dark green",
  density = 20,
  col = "black"
)

4. Exploring the relationship among various variables:

A. Using the ‘cor’ and ‘pairs.panels’ functions to analyse the correlation:

# Pairwise correlations between the selected numeric columns, followed by the
# matching pairs.panels() matrix plot of the same columns.
# NOTE(review): pdays is 999 for most rows (see the summary output), which
# looks like a placeholder rather than a real day count; its correlations
# should be read with that in mind - confirm against the metadata.
numeric_vars <- c(
  "age", "duration", "euribor3m", "emp.var.rate", "nr.employed",
  "pdays", "campaign"
)
cor(bank_client[numeric_vars])
##                       age    duration   euribor3m emp.var.rate nr.employed
## age           1.000000000 -0.02072651 -0.04462745  -0.04905263 -0.07468652
## duration     -0.020726510  1.00000000  0.05733951   0.07144035  0.05823209
## euribor3m    -0.044627449  0.05733951  1.00000000   0.95840218  0.94054583
## emp.var.rate -0.049052629  0.07144035  0.95840218   1.00000000  0.86752989
## nr.employed  -0.074686516  0.05823209  0.94054583   0.86752989  1.00000000
## pdays        -0.053516156  0.02893622  0.38773934   0.33488799  0.47499217
## campaign      0.003690016 -0.02587247  0.17512283   0.18573619  0.17697221
##                    pdays     campaign
## age          -0.05351616  0.003690016
## duration      0.02893622 -0.025872465
## euribor3m     0.38773934  0.175122827
## emp.var.rate  0.33488799  0.185736186
## nr.employed   0.47499217  0.176972215
## pdays         1.00000000  0.089300624
## campaign      0.08930062  1.000000000
pairs.panels(bank_client[numeric_vars])

B.i. Using ‘boxplot’ to explore the relationship among previous variables:

# One box plot per numeric variable, split by the target y ("no" / "yes").
# Variables covered: duration, cons.conf.idx, emp.var.rate, cons.price.idx,
# age, campaign, pdays, euribor3m and nr.employed.
boxplot(duration ~ y, data = bank_client, main = "Boxplot of duration")

boxplot(
  cons.conf.idx ~ y,
  data = bank_client,
  main = "Boxplot of Consumer confidence index"
)

boxplot(
  emp.var.rate ~ y,
  data = bank_client,
  main = "Boxplot of employment variation rate"
)

boxplot(
  cons.price.idx ~ y,
  data = bank_client,
  main = "Boxplot of consumer price index"
)

boxplot(age ~ y, data = bank_client, main = "Boxplot of age")

boxplot(campaign ~ y, data = bank_client, main = "Boxplot of campaign")

boxplot(pdays ~ y, data = bank_client, main = "Boxplot of pdays")

boxplot(
  euribor3m ~ y,
  data = bank_client,
  main = "Boxplot of Euro bank quarterly rate "
)

boxplot(
  nr.employed ~ y,
  data = bank_client,
  main = "Boxplot of number of employees"
)

# 4.B.ii Using the ‘aggregate’ function to analyze the spectrum of these variables by ‘y’ i.e. conversion to CD.

aggregate(cons.conf.idx ~ y, summary, data = bank_client)
##     y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1  no          -50.80000             -42.70000            -41.80000
## 2 yes          -50.80000             -46.20000            -40.40000
##   cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1          -40.64647             -36.40000          -26.90000
## 2          -39.78978             -36.10000          -26.90000
aggregate(cons.price.idx ~ y, summary, data = bank_client)
##     y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1  no            92.20100               93.07500              93.91800
## 2 yes            92.20100               92.89300              93.20000
##   cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1            93.60397               93.99400            94.76700
## 2            93.35439               93.91800            94.76700
aggregate(duration ~ y, summary, data = bank_client)
##     y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1  no        1.0000          94.0000        166.0000      221.5323
## 2 yes       37.0000         253.0000        449.0000      553.1912
##   duration.3rd Qu. duration.Max.
## 1         279.2500     1994.0000
## 2         741.2500     4199.0000
aggregate(emp.var.rate ~ y, summary, data = bank_client)
##     y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1  no        -3.4000000           -1.8000000           1.1000000
## 2 yes        -3.4000000           -1.8000000          -1.8000000
##   emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1         0.2409052            1.4000000         1.4000000
## 2        -1.2334483           -0.1000000         1.4000000
aggregate(campaign ~ y, summary, data = bank_client)
##     y campaign.Min. campaign.1st Qu. campaign.Median campaign.Mean
## 1  no      1.000000         1.000000        2.000000      2.614871
## 2 yes      1.000000         1.000000        2.000000      2.051724
##   campaign.3rd Qu. campaign.Max.
## 1         3.000000     39.000000
## 2         2.000000     23.000000
aggregate(pdays ~ y, summary, data = bank_client)
##     y pdays.Min. pdays.1st Qu. pdays.Median pdays.Mean pdays.3rd Qu.
## 1  no     0.0000      999.0000     999.0000   982.5293      999.0000
## 2 yes     0.0000      999.0000     999.0000   792.0356      999.0000
##   pdays.Max.
## 1   999.0000
## 2   999.0000
aggregate(euribor3m ~ y, summary, data = bank_client)
##     y euribor3m.Min. euribor3m.1st Qu. euribor3m.Median euribor3m.Mean
## 1  no       0.635000          1.405000         4.857000       3.797283
## 2 yes       0.634000          0.849000         1.266000       2.123135
##   euribor3m.3rd Qu. euribor3m.Max.
## 1          4.962000       4.970000
## 2          4.406000       5.045000
aggregate(age ~ y, summary, data = bank_client)
##     y age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
## 1  no 17.00000    32.00000   38.00000 39.89375    47.00000 88.00000
## 2 yes 17.00000    31.00000   37.00000 40.91315    50.00000 98.00000
aggregate(nr.employed ~ y, summary, data = bank_client)
##     y nr.employed.Min. nr.employed.1st Qu. nr.employed.Median
## 1  no         4963.600            5099.100           5195.800
## 2 yes         4963.600            5017.500           5099.100
##   nr.employed.Mean nr.employed.3rd Qu. nr.employed.Max.
## 1         5175.497            5228.100         5228.100
## 2         5095.116            5191.000         5228.100

4.C. Here we visualize ‘y’ for a combination of variables:

# 3D scatter plots of three numeric variables at a time. The plotting symbol
# encodes y: as.numeric() on the factor gives 1 for its first level and 2 for
# its second, and the legend maps the same symbol codes back to the level
# names. Legend symbols are derived from the levels rather than hard-coded.
# Title typo fixed ("scatterpot" -> "scatterplot").
y_levels <- levels(bank_client$y)

# age vs. duration vs. campaign
scatterplot3d(
  bank_client$age, bank_client$duration, bank_client$campaign,
  pch = as.numeric(bank_client$y),
  main = "3D scatterplot of bank_client"
)
legend("topright", legend = y_levels, cex = 0.8, pch = seq_along(y_levels))

# nr.employed vs. duration vs. euribor3m
scatterplot3d(
  bank_client$nr.employed, bank_client$duration, bank_client$euribor3m,
  pch = as.numeric(bank_client$y),
  main = "3D scatterplot of bank_client"
)
legend("topright", legend = y_levels, cex = 0.8, pch = seq_along(y_levels))

From here on, we begin modelling the data so that we can predict whether a client will subscribe to a ‘Certified term deposit’ (y).

5. Data preparation:

To model the data, we split it into two sets: a ‘Train’ dataset, which we use to build the model, and a ‘Test’ dataset, to which we apply the model in order to predict the output.

5.A.

# Fix the random seed so the partition below is reproducible.
set.seed(888)

# Draw row indices for the training set: 70% of the rows (p = 0.7), sampled on
# the target y. list = FALSE returns the indices as a matrix instead of a list.
inTrain <- createDataPartition(bank_client$y, p=0.7, list = FALSE)

# Training set: the sampled rows.
bank_client_Train <- bank_client[inTrain, ]
# Testing set: every row that was not sampled (negative indexing).
bank_client_Test <- bank_client[-inTrain, ]

summary(bank_client_Train)
##       age                 job           marital    
##  Min.   :17.00   admin.     :1749   divorced: 703  
##  1st Qu.:31.00   blue-collar:1260   married :3733  
##  Median :38.00   technician :1022   single  :2046  
##  Mean   :40.43   services   : 564   unknown :  14  
##  3rd Qu.:48.00   management : 444                  
##  Max.   :98.00   retired    : 415                  
##                  (Other)    :1042                  
##                education       default        housing          loan     
##  university.degree  :2077   no     :5472   no     :2840   no     :5383  
##  high.school        :1465   unknown:1024   unknown: 159   unknown: 159  
##  professional.course: 843                  yes    :3497   yes    : 954  
##  basic.9y           : 838                                               
##  basic.4y           : 625                                               
##  unknown            : 329                                               
##  (Other)            : 319                                               
##       contact         month      day_of_week    duration     
##  cellular :4668   may    :1815   fri:1271    Min.   :   1.0  
##  telephone:1828   jul    :1033   mon:1276    1st Qu.: 144.0  
##                   aug    : 945   thu:1387    Median : 266.0  
##                   jun    : 816   tue:1264    Mean   : 386.6  
##                   nov    : 595   wed:1298    3rd Qu.: 530.0  
##                   apr    : 558               Max.   :4199.0  
##                   (Other): 734                               
##     campaign          pdays          previous             poutcome   
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.0000   failure    : 761  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000   nonexistent:5062  
##  Median : 2.000   Median :999.0   Median :0.0000   success    : 673  
##  Mean   : 2.347   Mean   :886.7   Mean   :0.3193                     
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000                     
##  Max.   :39.000   Max.   :999.0   Max.   :6.0000                     
##                                                                      
##   emp.var.rate     cons.price.idx  cons.conf.idx      euribor3m    
##  Min.   :-3.4000   Min.   :92.20   Min.   :-50.80   Min.   :0.634  
##  1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70   1st Qu.:1.250  
##  Median :-0.1000   Median :93.44   Median :-41.80   Median :4.021  
##  Mean   :-0.4814   Mean   :93.48   Mean   :-40.28   Mean   :2.969  
##  3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40   3rd Qu.:4.959  
##  Max.   : 1.4000   Max.   :94.77   Max.   :-26.90   Max.   :5.045  
##                                                                    
##   nr.employed     y       
##  Min.   :4964   no :3248  
##  1st Qu.:5076   yes:3248  
##  Median :5191             
##  Mean   :5136             
##  3rd Qu.:5228             
##  Max.   :5228             
## 
summary(bank_client_Test)
##       age                 job          marital    
##  Min.   :17.00   admin.     :768   divorced: 318  
##  1st Qu.:31.00   blue-collar:509   married :1605  
##  Median :38.00   technician :437   single  : 854  
##  Mean   :40.35   services   :209   unknown :   7  
##  3rd Qu.:48.00   management :207                  
##  Max.   :88.00   retired    :180                  
##                  (Other)    :474                  
##                education      default        housing          loan     
##  university.degree  :930   no     :2352   no     :1264   no     :2305  
##  high.school        :637   unknown: 432   unknown:  66   unknown:  66  
##  professional.course:347                  yes    :1454   yes    : 413  
##  basic.9y           :339                                               
##  basic.4y           :270                                               
##  basic.6y           :143                                               
##  (Other)            :118                                               
##       contact         month     day_of_week    duration     
##  cellular :2004   may    :718   fri:492     Min.   :   1.0  
##  telephone: 780   jul    :444   mon:570     1st Qu.: 146.0  
##                   aug    :408   thu:613     Median : 264.0  
##                   jun    :353   tue:546     Mean   : 389.1  
##                   nov    :291   wed:563     3rd Qu.: 517.0  
##                   apr    :227               Max.   :2692.0  
##                   (Other):343                               
##     campaign          pdays          previous            poutcome   
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.000   failure    : 313  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000   nonexistent:2182  
##  Median : 2.000   Median :999.0   Median :0.000   success    : 289  
##  Mean   : 2.302   Mean   :888.7   Mean   :0.306                     
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000                     
##  Max.   :29.000   Max.   :999.0   Max.   :5.000                     
##                                                                     
##   emp.var.rate     cons.price.idx  cons.conf.idx      euribor3m    
##  Min.   :-3.4000   Min.   :92.20   Min.   :-50.80   Min.   :0.635  
##  1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70   1st Qu.:1.057  
##  Median :-1.1000   Median :93.44   Median :-41.80   Median :1.811  
##  Mean   :-0.5311   Mean   :93.47   Mean   :-40.08   Mean   :2.940  
##  3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40   3rd Qu.:4.959  
##  Max.   : 1.4000   Max.   :94.77   Max.   :-26.90   Max.   :5.045  
##                                                                    
##   nr.employed     y       
##  Min.   :4964   no :1392  
##  1st Qu.:5076   yes:1392  
##  Median :5099             
##  Mean   :5134             
##  3rd Qu.:5228             
##  Max.   :5228             
## 

We check the distribution of our target variable ‘y’ in the new Train and Test datasets:

table(bank_client_Train$y)
## 
##   no  yes 
## 3248 3248
table(bank_client_Test$y)
## 
##   no  yes 
## 1392 1392
prop.table(table(bank_client_Train$y))
## 
##  no yes 
## 0.5 0.5
prop.table(table(bank_client_Test$y))
## 
##  no yes 
## 0.5 0.5

6. We train our first Decision Tree model to eventually predict y. Let us consider this as our Baseline model.

# Baseline model: a C5.0 decision tree predicting y from every other column
# (formula y ~ .), fitted on the training partition with no extra options.
# NOTE(review): 'duration' (length of the last contact) is among the
# predictors and the fitted tree splits on it heavily. It is presumably only
# known once the call has ended, so a model meant to decide whom to call may
# not be able to use it - confirm whether it should be excluded.
client_m1_c50 <- C5.0(y~., bank_client_Train)
# Print the fitted object (call, number of samples/predictors, tree size).
client_m1_c50
## 
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train)
## 
## Classification Tree
## Number of samples: 6496 
## Number of predictors: 20 
## 
## Tree size: 57 
## 
## Non-standard options: attempt to group attributes
# We could plot the tree, but we do not for now because it has far too many
# nodes to be readable.

# Print the full tree, its evaluation on the training data and the attribute
# usage.
summary(client_m1_c50)
## 
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 27 23:41:41 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 6496 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## nr.employed <= 5076.2:
## :...duration > 158: yes (1474/107)
## :   duration <= 158:
## :   :...duration <= 62: no (16)
## :       duration > 62:
## :       :...pdays <= 3: yes (42/6)
## :           pdays > 3:
## :           :...campaign > 3:
## :               :...nr.employed <= 5023.5: no (13/1)
## :               :   nr.employed > 5023.5:
## :               :   :...day_of_week in {fri,mon}: yes (5)
## :               :       day_of_week in {thu,tue,wed}: no (5)
## :               campaign <= 3:
## :               :...day_of_week in {thu,tue,wed}: yes (109/34)
## :                   day_of_week in {fri,mon}:
## :                   :...month in {dec,jun,mar,may,nov}: no (40/15)
## :                       month in {jul,oct}: yes (21/8)
## :                       month = apr:
## :                       :...previous <= 0: yes (3)
## :                       :   previous > 0: no (2)
## :                       month = aug:
## :                       :...education in {basic.4y,basic.6y,high.school,
## :                       :   :             illiterate,professional.course,
## :                       :   :             university.degree,unknown}: no (14)
## :                       :   education = basic.9y: yes (2)
## :                       month = sep:
## :                       :...duration <= 101: yes (2)
## :                           duration > 101: no (4)
## nr.employed > 5076.2:
## :...duration > 438:
##     :...duration > 649: yes (996/113)
##     :   duration <= 649:
##     :   :...contact = telephone:
##     :       :...emp.var.rate <= -0.1: yes (16/2)
##     :       :   emp.var.rate > -0.1:
##     :       :   :...cons.price.idx > 94.215: yes (51/17)
##     :       :       cons.price.idx <= 94.215:
##     :       :       :...duration <= 532: no (50/9)
##     :       :           duration > 532:
##     :       :           :...day_of_week = fri: yes (12/1)
##     :       :               day_of_week in {mon,thu,tue,wed}: no (46/20)
##     :       contact = cellular:
##     :       :...euribor3m <= 4.021: yes (229/46)
##     :           euribor3m > 4.021:
##     :           :...emp.var.rate <= -0.1:
##     :               :...education in {basic.4y,basic.6y,basic.9y,high.school,
##     :               :   :             illiterate,professional.course,
##     :               :   :             unknown}: no (27/8)
##     :               :   education = university.degree:
##     :               :   :...euribor3m <= 4.153: yes (12/1)
##     :               :       euribor3m > 4.153: no (5/1)
##     :               emp.var.rate > -0.1:
##     :               :...duration > 486: yes (134/32)
##     :                   duration <= 486:
##     :                   :...default = unknown: no (7)
##     :                       default = no:
##     :                       :...campaign <= 2: no (13/2)
##     :                           campaign > 2: yes (12/2)
##     duration <= 438:
##     :...month = sep: no (0)
##         month in {aug,jul,jun,may,nov}:
##         :...euribor3m > 1.27:
##         :   :...contact = cellular: no (1370/68)
##         :   :   contact = telephone:
##         :   :   :...euribor3m <= 4.191: no (48/3)
##         :   :       euribor3m > 4.191:
##         :   :       :...euribor3m <= 4.663: yes (10/1)
##         :   :           euribor3m > 4.663: no (1048/13)
##         :   euribor3m <= 1.27:
##         :   :...duration <= 175: no (113/8)
##         :       duration > 175:
##         :       :...day_of_week = thu: no (26/3)
##         :           day_of_week in {fri,mon,tue,wed}:
##         :           :...euribor3m > 1.252: yes (39/4)
##         :               euribor3m <= 1.252:
##         :               :...duration <= 284: no (14)
##         :                   duration > 284:
##         :                   :...marital in {divorced,single,unknown}: yes (6)
##         :                       marital = married:
##         :                       :...duration <= 325: yes (2)
##         :                           duration > 325: no (5)
##         month in {apr,dec,mar,oct}:
##         :...duration <= 90:
##             :...day_of_week in {fri,mon,wed}: no (34/1)
##             :   day_of_week in {thu,tue}:
##             :   :...euribor3m <= 1.629: no (11/2)
##             :       euribor3m > 1.629: yes (5)
##             duration > 90:
##             :...default = unknown: no (23/6)
##                 default = no:
##                 :...cons.price.idx > 93.369: yes (23)
##                     cons.price.idx <= 93.369:
##                     :...poutcome = failure:
##                         :...day_of_week in {fri,mon}: no (18/3)
##                         :   day_of_week = wed: yes (9/2)
##                         :   day_of_week = thu:
##                         :   :...duration <= 188: no (2)
##                         :   :   duration > 188: yes (6)
##                         :   day_of_week = tue:
##                         :   :...education in {basic.4y,
##                         :       :             university.degree}: yes (4)
##                         :       education in {basic.6y,basic.9y,high.school,
##                         :                     illiterate,professional.course,
##                         :                     unknown}: no (4)
##                         poutcome in {nonexistent,success}:
##                         :...cons.conf.idx <= -49.5: yes (82/9)
##                             cons.conf.idx > -49.5:
##                             :...day_of_week in {mon,thu,tue,
##                                 :               wed}: yes (198/47)
##                                 day_of_week = fri:
##                                 :...education in {basic.4y,basic.6y,basic.9y,
##                                     :             high.school,
##                                     :             illiterate}: no (16/1)
##                                     education in {professional.course,
##                                     :             university.degree,unknown}:
##                                     :...euribor3m <= 1.435: yes (14/3)
##                                         euribor3m > 1.435: no (4/1)
## 
## 
## Evaluation on training data (6496 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      56  600( 9.2%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2813   435    (a): class no
##     165  3083    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% duration
##  100.00% nr.employed
##   49.60% month
##   48.55% euribor3m
##   47.57% contact
##   10.50% day_of_week
##    8.30% cons.price.idx
##    6.70% default
##    5.93% emp.var.rate
##    5.50% poutcome
##    4.83% cons.conf.idx
##    4.03% pdays
##    3.77% campaign
##    1.57% education
##    0.20% marital
##    0.08% previous
## 
## 
## Time: 0.0 secs

6.B. We now use our 1st model, trained on the Train dataset, to make predictions on the Test set. We evaluate its performance using a confusion matrix and the function ‘mmetric’.

# Applying the baseline model to the test dataset:

# Predictions of the baseline tree for every row of the test split.
predicted_client_test1 <- predict(client_m1_c50, bank_client_Test)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Test$y, predicted_client_test1, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1132  260
##    yes  117 1275
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Test$y, predicted_client_test1, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   86.45833   81.32184   91.59483   90.63251   83.06189   85.72510 
##        F12 
##   87.11992
# Same evaluation on the training split, to compare against the test results
# and gauge how much the baseline tree overfits.
predicted_client_train1 <- predict(client_m1_c50, bank_client_Train)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Train$y, predicted_client_train1, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2813  435
##    yes  165 3083
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Train$y, predicted_client_train1, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   90.76355   86.60714   94.91995   94.45937   87.63502   90.36299 
##        F12 
##   91.13213

As seen from the above results, our model's predictions were very good. When applied to the Train set the model gave an accuracy of around 90%, which is very good, and it reached about 86% on the Test set. However, the decision tree is too big, and the high training accuracy could possibly result from overfitting.

We use pruning to reduce the size of the decision tree. This will reduce the accuracy on the training data, but generally increases the accuracy on Test data. As seen above, the decision tree we learned contains branches that are so specific that they may not apply to anything but the training data. Hence, to overcome the possibility of overfitting, we simplify the tree by adjusting the confidence factor (CF).

7. Simplifying the decision tree and analysing it using the plot and summary functions:

A.

# Second model: same formula and training data as the baseline, but with the
# pruning confidence factor set to CF = 0.002 through C5.0Control().
client_m2_C50 <- C5.0(y~., bank_client_Train, control = C5.0Control(CF = 0.002))
# Auto-print the fitted object (call, sample/predictor counts and tree size).
client_m2_C50
## 
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train, control
##  = C5.0Control(CF = 0.002))
## 
## Classification Tree
## Number of samples: 6496 
## Number of predictors: 20 
## 
## Tree size: 8 
## 
## Non-standard options: attempt to group attributes, confidence level: 0.002
# The pruned tree is small enough to draw.
plot(client_m2_C50)

# Print the full tree, its evaluation on the training data and the attribute
# usage.
summary(client_m2_C50)
## 
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train, control
##  = C5.0Control(CF = 0.002))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 27 23:41:42 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 6496 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## nr.employed <= 5076.2:
## :...duration > 158: yes (1474/107)
## :   duration <= 158:
## :   :...duration <= 62: no (16)
## :       duration > 62:
## :       :...campaign <= 3: yes (239/93)
## :           campaign > 3: no (23/6)
## nr.employed > 5076.2:
## :...duration > 438: yes (1610/322)
##     duration <= 438:
##     :...month in {aug,jul,jun,may,nov,sep}: no (2681/147)
##         month in {apr,dec,mar,oct}:
##         :...duration <= 90: no (50/8)
##             duration > 90: yes (403/117)
## 
## 
## Evaluation on training data (6496 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       8  800(12.3%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2609   639    (a): class no
##     161  3087    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% duration
##  100.00% nr.employed
##   48.25% month
##    4.03% campaign
## 
## 
## Time: 0.0 secs

7.B.: We now use our simplified Tree to create another prediction model and apply it on Test and Train sets:

# Predictions of the pruned tree (model 2) for every row of the test split.
predicted_client_test2 <- predict(client_m2_C50, bank_client_Test)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Test$y, predicted_client_test2, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1076  316
##    yes   83 1309
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Test$y, predicted_client_test2, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   85.66810   77.29885   94.03736   92.83865   80.55385   84.35907 
##        F12 
##   86.77494
# Same evaluation of the pruned tree (model 2) on the training split.
predicted_client_train2 <- predict(client_m2_C50, bank_client_Train)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Train$y, predicted_client_train2, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2609  639
##    yes  161 3087
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Train$y, predicted_client_train2, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   87.68473   80.32635   95.04310   94.18773   82.85024   86.70655 
##        F12 
##   88.52882

Notice that the prediction accuracy of our model has reduced only slightly (Test: 86.5% to 85.7%; Train: 90.8% to 87.7%), while the tree size has dropped from 57 to 8.

8. We now remove the variable ‘duration’ from our model to analyse its effect on predicting ‘y’:

A

# Third model: drop column 11 ('duration') and column 21 (the target 'y') from
# the predictors and pass the target separately as the second argument.
# NOTE(review): these positional indices rely on the column order of
# bank_client_Train; selecting the columns by name would be more robust.
client_m3_C50 <- C5.0(bank_client_Train[c(-11,-21)], bank_client_Train$y)
# Auto-print the fitted object (call, sample/predictor counts and tree size).
client_m3_C50
## 
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y = bank_client_Train$y)
## 
## Classification Tree
## Number of samples: 6496 
## Number of predictors: 19 
## 
## Tree size: 29 
## 
## Non-standard options: attempt to group attributes
# Draw the tree fitted without 'duration'.
plot(client_m3_C50)

# Print the full tree, its evaluation on the training data and the attribute
# usage.
summary(client_m3_C50)
## 
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y = bank_client_Train$y)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 27 23:41:44 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 6496 cases (20 attributes) from undefined.data
## 
## Decision tree:
## 
## nr.employed <= 5076.2: yes (1752/233)
## nr.employed > 5076.2:
## :...pdays <= 11: yes (75/8)
##     pdays > 11:
##     :...month = sep: no (0)
##         month in {apr,dec,mar,oct}:
##         :...default = unknown: no (39/12)
##         :   default = no:
##         :   :...cons.price.idx > 93.369: yes (30/1)
##         :       cons.price.idx <= 93.369:
##         :       :...day_of_week in {thu,tue,wed}: yes (332/64)
##         :           day_of_week in {fri,mon}:
##         :           :...previous > 0: no (29/6)
##         :               previous <= 0:
##         :               :...cons.conf.idx <= -49.5: yes (48/11)
##         :                   cons.conf.idx > -49.5:
##         :                   :...campaign <= 1: yes (54/16)
##         :                       campaign > 1:
##         :                       :...marital in {divorced,married,
##         :                           :           unknown}: no (44/11)
##         :                           marital = single: yes (18/7)
##         month in {aug,jul,jun,may,nov}:
##         :...euribor3m > 1.291: no (3602/1032)
##             euribor3m <= 1.291:
##             :...contact = telephone: no (29/4)
##                 contact = cellular:
##                 :...campaign > 5: no (34/7)
##                     campaign <= 5:
##                     :...job in {entrepreneur,management,self-employed,
##                         :       unemployed,unknown}: yes (48/20)
##                         job in {housemaid,retired}: no (14/5)
##                         job = admin.:
##                         :...education in {basic.4y,basic.9y,
##                         :   :             unknown}: yes (9)
##                         :   education in {basic.6y,high.school,illiterate,
##                         :                 professional.course,
##                         :                 university.degree}: no (105/48)
##                         job = services:
##                         :...previous <= 0: yes (32/9)
##                         :   previous > 0: no (20/7)
##                         job = student:
##                         :...campaign <= 2: no (10/3)
##                         :   campaign > 2: yes (4)
##                         job = blue-collar:
##                         :...loan in {unknown,yes}: no (13/3)
##                         :   loan = no:
##                         :   :...day_of_week in {fri,wed}: yes (51/19)
##                         :       day_of_week in {mon,thu,tue}: no (51/21)
##                         job = technician:
##                         :...campaign > 2: no (14/2)
##                             campaign <= 2:
##                             :...age <= 36: yes (23/7)
##                                 age > 36:
##                                 :...euribor3m <= 1.244: yes (4)
##                                     euribor3m > 1.244: no (12/2)
## 
## 
## Evaluation on training data (6496 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      28 1558(24.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2853   395    (a): class no
##    1163  2085    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% nr.employed
##   73.03% pdays
##   71.88% month
##   62.73% euribor3m
##    9.65% day_of_week
##    9.14% default
##    8.62% campaign
##    8.54% cons.price.idx
##    7.28% contact
##    6.31% job
##    3.77% previous
##    2.52% cons.conf.idx
##    1.77% loan
##    1.75% education
##    0.95% marital
##    0.60% age
## 
## 
## Time: 0.0 secs

B. We now create a new prediction model with the above changes and analyse its performance on the Test and Train sets:

# Predictions of the no-'duration' tree (model 3) for the test split. The test
# data frame is passed unchanged, i.e. it still contains 'duration' and 'y'.
predicted_client_test3 <- predict(client_m3_C50, bank_client_Test)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Test$y, predicted_client_test3, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1174  218
##    yes  505  887
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Test$y, predicted_client_test3, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   74.03017   84.33908   63.72126   69.92257   80.27149   76.45718 
##        F12 
##   71.04525
# Same evaluation of the no-'duration' tree (model 3) on the training split.
predicted_client_train3 <- predict(client_m3_C50, bank_client_Train)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Train$y, predicted_client_train3, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2853  395
##    yes 1163 2085
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Train$y, predicted_client_train3, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   76.01601   87.83867   64.19335   71.04084   84.07258   78.55176 
##        F12 
##   72.80028

9. We will now simplify the Tree after removing the duration variable and predict again.

A.

# Fourth model: same predictors as model 3 (columns 11 'duration' and 21 'y'
# dropped by position), now with the pruning confidence factor set to
# CF = 0.001.
# NOTE(review): the positional indices rely on the column order of
# bank_client_Train; selecting the columns by name would be more robust.
client_m4_C50 <- C5.0(bank_client_Train[c(-11,-21)], bank_client_Train$y, control = C5.0Control(CF = 0.001))

# Auto-print the fitted object (call, sample/predictor counts and tree size).
client_m4_C50
## 
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y =
##  bank_client_Train$y, control = C5.0Control(CF = 0.001))
## 
## Classification Tree
## Number of samples: 6496 
## Number of predictors: 19 
## 
## Tree size: 4 
## 
## Non-standard options: attempt to group attributes, confidence level: 0.001
# Draw the pruned tree fitted without 'duration'.
plot(client_m4_C50)

# Print the full tree, its evaluation on the training data and the attribute
# usage.
summary(client_m4_C50)
## 
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y =
##  bank_client_Train$y, control = C5.0Control(CF = 0.001))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 27 23:41:46 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 6496 cases (20 attributes) from undefined.data
## 
## Decision tree:
## 
## nr.employed <= 5076.2: yes (1752/233)
## nr.employed > 5076.2:
## :...pdays <= 11: yes (75/8)
##     pdays > 11:
##     :...month in {apr,dec,mar,oct}: yes (594/182)
##         month in {aug,jul,jun,may,nov,sep}: no (4075/1250)
## 
## 
## Evaluation on training data (6496 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       4 1673(25.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    2825   423    (a): class no
##    1250  1998    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% nr.employed
##   73.03% pdays
##   71.88% month
## 
## 
## Time: 0.0 secs

As seen above, we have a good simplified decision tree with only 4 leaf nodes. Notice that we adjusted the CF (here to 0.001) to achieve this.

B. Final Prediction model:

# Predictions of the final model (no 'duration', CF = 0.001) for the test
# split.
predicted_client_test4 <- predict(client_m4_C50, bank_client_Test)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Test$y, predicted_client_test4, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  1182  210
##    yes  518  874
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Test$y, predicted_client_test4, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   73.85057   84.91379   62.78736   69.52941   80.62731   76.45537 
##        F12 
##   70.59774
# Same evaluation of the final model on the training split.
predicted_client_train4 <- predict(client_m4_C50, bank_client_Train)
# Confusion matrix: rows are the actual y ("target"), columns the predictions.
mmetric(bank_client_Train$y, predicted_client_train4, metric = 'CONF')
## $res
## NULL
## 
## $conf
##       pred
## target   no  yes
##    no  2825  423
##    yes 1250 1998
## 
## $roc
## NULL
## 
## $lift
## NULL
# Overall accuracy plus per-class recall (TPR), precision and F1, in percent.
# Suffix 1 refers to class 'no' and suffix 2 to class 'yes'.
mmetric(bank_client_Train$y, predicted_client_train4, metric = c('ACC', 'TPR', 'PRECISION', 'F1'))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   74.24569   86.97660   61.51478   69.32515   82.52788   77.15417 
##        F12 
##   70.48862

Conclusion: Initial extensive data exploration and mining gave us a good insight into our data and helped us improve our modelling for better prediction. In general, if we increase pruning, the accuracy on the training set will be lower. By using the Train-Test split, we were possibly able to discover a “sweet spot” of the pruning confidence factor, where it prunes enough to make the learned decision tree sufficiently accurate on test data but doesn’t sacrifice too much accuracy on the training data. As seen from the last output, we can predict whether a client will subscribe to a Certified term deposit with over 73% accuracy on the Test set, even without the ‘duration’ variable.